This document is part of the Explorary Data Analysis (EDA) for a kaggle project aiming to improve the “Zestimates” (home value) prediction algorithm used by Zillow. I loaded the raw data from the following project page, and conducted data visualization to facilitate model fitting
rm(list=ls())
library(lattice)
library(ggplot2)
library(rmutil)
library(corrplot)
library(leaflet)
library(plotly)
library(GGally)
library(knitr)
library(shiny)
train <- read.csv("train_property.csv", stringsAsFactors = F)
Generate time related features
train$trans_year <- sapply(strsplit(train$transactiondate, '-'), '[', 1)
train$trans_month <- sapply(strsplit(train$transactiondate, '-'), '[', 2)
train$trans_day <- sapply(strsplit(train$transactiondate, '-'), '[', 3)
train$trans_weekday <- weekdays(as.Date(train$transactiondate))
train$trans_DATE <- as.Date(train$transactiondate)
train <- plyr::rename(train,
c("parcelid"="id_parcel",
"transactiondate" = "trans_date",
"yearbuilt" = "build_year",
"basementsqft"="area_base_living",
"yardbuildingsqft17"="area_patio",
"yardbuildingsqft26"="area_shed",
"poolsizesum"="area_pool",
"lotsizesquarefeet"="area_lot",
"garagetotalsqft"="area_garage",
"finishedfloor1squarefeet" = "area_firstfloor_finished",
"calculatedfinishedsquarefeet" = "area_total_calc",
"finishedsquarefeet6" = "area_base",
"finishedsquarefeet12" = "area_live_finished",
"finishedsquarefeet13" = "area_liveperi_finished",
"finishedsquarefeet15" = "area_total_finished",
"finishedsquarefeet50" = "area_unknown",
"unitcnt" = "num_unit",
"numberofstories" = "num_story",
"roomcnt" = "num_room",
"bathroomcnt" = "num_bathroom",
"bedroomcnt" = "num_bedroom",
"calculatedbathnbr" = "num_bathroom_calc",
"fullbathcnt" = "num_bath",
"threequarterbathnbr" = "num_75_bath",
"fireplacecnt" = "num_fireplace",
"poolcnt" = "num_pool",
"garagecarcnt" = "num_garage",
"regionidcounty" = "region_county",
"regionidcity" = "region_city",
"regionidzip" = "region_zip",
"regionidneighborhood" = "region_neighbor",
"taxvaluedollarcnt" = "tax_total",
"structuretaxvaluedollarcnt" = "tax_building",
"landtaxvaluedollarcnt" = "tax_land",
"taxamount" = "tax_property",
"assessmentyear" = "tax_year",
"taxdelinquencyflag" = "tax_delinquency",
"taxdelinquencyyear" = "tax_delinquency_year",
"propertyzoningdesc" = "zoning_property",
"propertylandusetypeid" = "zoning_landuse",
"propertycountylandusecode" = "zoning_landuse_county",
"fireplaceflag" = "flag_fireplace",
"hashottuborspa" = "flag_tub",
"buildingqualitytypeid" = "quality",
"buildingclasstypeid" = "framing",
"typeconstructiontypeid" = "material",
"decktypeid" = "deck",
"storytypeid" = "story",
"heatingorsystemtypeid" = "heating",
"airconditioningtypeid" = "aircon",
"architecturalstyletypeid" = "architectural_style",
"pooltypeid10" = "flag_spa",
"pooltypeid2" = "flag_pool_spa",
"pooltypeid7" = "flag_pool_tub",
"fips"="county"))
# numerical variable
variable_numeric = c("area_firstfloor_finished",
"area_base", "area_base_living",
"area_garage",
"area_live_finished",
"area_liveperi_finished",
"area_lot",
"area_patio",
"area_pool",
"area_shed",
"area_total_calc",
"area_total_finished",
"area_unknown",
"tax_building",
"tax_land",
"tax_property",
"tax_total",
"latitude",
"longitude")
# discrete
variable_discrete = c("num_75_bath",
"num_bath",
"num_bathroom",
"num_bathroom_calc",
"num_bedroom",
"num_fireplace",
"num_garage",
"num_pool",
"num_room",
"num_story",
"num_unit")
variable_binary = c("flag_fireplace",
"flag_tub",
"flag_spa",
"flag_pool_spa",
"flag_pool_tub",
"tax_delinquency")
# categorical variable
variable_nominal = c("aircon",
"architectural_style",
"county",
"deck",
"framing",
"heating",
"id_parcel",
"material",
"region_city",
"region_county",
"region_neighbor",
"region_zip",
"story",
"zoning_landuse",
"zoning_landuse_county")
variable_ordinal = c("quality")
# date
variable_date = c("tax_year",
"build_year",
"tax_delinquency_year",
"trans_year",
"trans_month",
"trans_day",
"trans_date",
"trans_weekday")
# others
variable_unstruct = c("zoning_property")
# don't understand
variable_unknown = c('censustractandblock',
'rawcensustractandblock')
# convert binary to 0, 1
train[train$flag_fireplace == "", "flag_fireplace"] = 0
train[train$flag_fireplace == "true", "flag_fireplace"] = 1
train[train$flag_tub == "", "flag_tub"] = 0
train[train$flag_tub == "true", "flag_tub"] = 1
train[train$tax_delinquency == "", "tax_delinquency"] = 0
train[train$tax_delinquency == "Y", "tax_delinquency"] = 1
# convert to date to int
train[,variable_date] = sapply(train[,variable_date], as.character)
# convert to numeric to double
train[,variable_numeric] = sapply(train[,variable_numeric], as.numeric)
# convert to discrete to int
train[,c(variable_discrete, variable_binary)] = sapply(train[,c(variable_discrete, variable_binary)], as.integer)
# convert to categorical to character
train[,c(variable_nominal, variable_ordinal)] = sapply(train[,c(variable_nominal, variable_ordinal)], as.character)
par(mfrow=c(1,2)) #plot in 1 figure with 2 subplot(1 row, 2 col)
barplot(sort(table(train$county))) # sorted barplot
barplot(sort(table(train$region_county)))
df = train[!is.na(train$num_bathroom_calc),
c("num_bathroom","num_bathroom_calc")]
ggplot(data = df,
aes(x=num_bathroom,
y=num_bathroom_calc)) + geom_point() + theme_minimal()
sum(df$num_bathroom == df$num_bathroom_calc) == nrow(df)
## [1] TRUE
df = train[!is.na(train$num_bath),
c("num_bathroom","num_bath")]
ggplot(data = df,
aes(x=num_bath,
y=num_bathroom)) + geom_point() + theme_minimal()
sum(df$num_bath == df$num_bathroom)/nrow(df)
## [1] 0.9989113
num.NA <- sort(colSums(sapply(train, is.na)))
dfnum.NA <- data.frame(ind = c(1:length(num.NA)),
percentage = num.NA/nrow(train),
per80 = num.NA/nrow(train)>=0.2,
name = names(num.NA),
row.names = NULL) # convert to data.frame
ggplot(data = dfnum.NA, aes(x=ind, y=percentage)) +
geom_bar(aes(fill=per80), stat="identity") +
scale_x_discrete(name ="column names",
limits=dfnum.NA$name)+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none") +
geom_hline(yintercept = 0.2) +
ggtitle("Percentage of Missing")
# overall missing similarity
MissSim <- function(X, feature){
na.df <- as.data.frame(is.na(X)) # convert to na.df
# Jaccard dist: 0-1, 0 means similar, 1 means not similar
na.similarity <- as.data.frame(as.matrix(proxy::dist(t(na.df),
method = "jaccard",
diag = T, upper = T))) # calculate similarity between columns
na.mat = as.matrix(na.similarity[order(na.similarity[feature,]), order(na.similarity[feature,])]) # ordered na.similarity, then convert to matrix
plot_ly(x=colnames(na.mat),
y=rownames(na.mat),
z = na.mat,
type = "heatmap")
}
MissSim(train, "area_base")
It seams some features are not missing randomly +missing value of binary +missing value of discrete +missing value of nominal +missing value of numeric
# missing value of binary
MissSim(train[, variable_binary], "flag_fireplace")
# missing value of discrete
MissSim(train[, variable_discrete], "num_story")
# missing value of nominal
MissSim(train[, variable_nominal], "story")
# missing value of numeric
MissSim(train[, variable_numeric], "area_base")
ggplot(data = train, aes(x=logerror)) +
geom_histogram(aes(y=..count../sum(..count..)),
bins = 400,
fill="red",
color="black") +
theme_minimal()+
theme(axis.title = element_text(size=16),
axis.text = element_text(size=14))+
ylab("Count")+coord_cartesian(x=c(-0.5,0.5))
ggplot(train, aes(sample = logerror), shape = 21, alpha = 0.8) +
stat_qq() + theme_minimal()
logerror_ecdf = ecdf(train$logerror)
x = sort(train$logerror)
cdfl = plaplace(x, m = mean(train$logerror) ,s=sd(train$logerror)*0.4)
qq = data.frame(cdfl = cdfl, ecdf = logerror_ecdf(x))
ggplot(data = qq, aes(x=cdfl, y=ecdf)) + geom_line() + theme_minimal()
train$abs_logerror <- abs(train$logerror)
ggplot(data=train, aes(x=abs_logerror)) +
geom_histogram(bins=400,
fill="red", color="black")+
theme_bw()+theme(axis.title = element_text(size=16),
axis.text = element_text(size=14))+
ylab("Count")+coord_cartesian(x=c(0,0.5))
abs_error.DATE = by(train, train$trans_DATE, function(x){return(mean(x$abs_logerror))})
error.DATE = by(train, train$trans_DATE, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(abs_error.DATE, type="b") + title("date v.s abs_logerror")
## integer(0)
plot(error.DATE, type="b") + title("date v.s logerror")
## integer(0)
barplot(table(train$trans_DATE)) + title("histogram of date")
## numeric(0)
```
abs_error.month = by(train, train$trans_month, function(x){return(mean(x$abs_logerror))})
error.month = by(train, train$trans_month, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(abs_error.month, type="b") + title("month v.s abs_logerror")
## integer(0)
plot(error.month, type="b") + title("month v.s logerror")
## integer(0)
barplot(table(train$trans_month)) + title("histogram of month")
## numeric(0)
abs_error.day = by(train, train$trans_day, function(x){return(mean(x$abs_logerror))})
error.day = by(train, train$trans_day, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(abs_error.day, type="b") + title("day v.s abs_logerror")
## integer(0)
plot(error.day, type="b") + title("day v.s logerror")
## integer(0)
barplot(table(train$trans_day)) + title("histogram of day")
## numeric(0)
abs_error.weekday = by(train, train$trans_weekday, function(x){return(mean(x$abs_logerror))})
error.weekday = by(train, train$trans_weekday, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(sort(abs_error.weekday), type="b") + title("weekday v.s abs_logerror")
## integer(0)
plot(sort(error.weekday), type="b") + title("weekday v.s logerror")
## integer(0)
barplot(sort(table(train$trans_weekday))) + title("histogram of weekday")
## numeric(0)
abs_error.build_year = by(train, train$build_year, function(x){return(mean(x$abs_logerror))})
error.build_year = by(train, train$build_year, function(x){return(mean(x$logerror))})
err.build_year = data.frame(year = as.integer(sort(unique(train$build_year))),
abs_logerror = as.numeric(abs_error.build_year),
logerror = as.numeric(error.build_year))
ggplot(data=err.build_year, aes(x = year, y = abs_logerror)) +
geom_point() + geom_line() + geom_smooth() + geom_hline(yintercept = mean(train$abs_logerror), color="red")
ggplot(data=err.build_year, aes(x = year, y = logerror)) +
geom_point() + geom_line() + geom_smooth() + coord_cartesian(ylim=c(-0.1,0.1))+geom_hline(yintercept = mean(train$logerror), color="red")
par(mfrow=c(1,1))
remain.col <- names(num.NA)[which(num.NA <= 0.8 * dim(train)[1])] # trainT = train
remain.numeric <- remain.col[which(remain.col %in% variable_numeric)]
remain.discrete <- remain.col[which(remain.col %in% variable_discrete)]
train.remain <- train[, c("abs_logerror",remain.numeric)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # numeric
train.remain <- train[, c("abs_logerror",remain.discrete)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete
train.remain <- train[, c("abs_logerror",remain.discrete)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete
train.remain <- train[, c("logerror",remain.numeric)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # numeric
train.remain <- train[, c("logerror",remain.discrete)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete
train.remain <- train[, c("logerror",c(remain.discrete, remain.numeric))]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete
df = train[, c("num_bath", "num_bedroom", "num_room")]
df.dropna = df[complete.cases(df),]
ggpairs(df.dropna,
method = "square",
tl.cex = 1,
type = 'lower')
df = train[,c("quality","logerror")]
df$quality = as.integer(df$quality)
ggplot(data=df, aes(x=as.factor(quality), y=logerror)) +
geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot(outlier.color=NA, color='steelblue') +
labs(x='quality')
# map on large abs_logerr
lower = quantile(train$abs_logerror, 0.1)
upper = quantile(train$abs_logerror, 0.9)
well_estimate = train[((train$abs_logerror < upper) & (train$abs_logerror > lower)),
c("longitude", "latitude", "abs_logerror")]
bad_estimate = train[((train$abs_logerror > upper) | (train$abs_logerror < lower)),
c("longitude", "latitude", "abs_logerror")]
over_est = quantile(train$logerror, 0.99)
over_estimate = train[(train$logerror > upper),
c("longitude", "latitude", "logerror")]
over_estimate$type = "over"
under_est = quantile(train$logerror, 0.01)
under_estimate = train[(train$abs_logerror < lower),
c("longitude", "latitude", "logerror")]
under_estimate$type = "under"
pal <- colorFactor(c("red","navy"), domain = c("over", "under"))
rbind(over_estimate, under_estimate) %>%
leaflet() %>%
addTiles() %>%
addCircleMarkers(lng=~longitude/10e5,
lat=~latitude/10e5,
color = ~pal(type),
radius = 3,
stroke = FALSE, fillOpacity = 0.3)
mapRegion = function(X, ub, lb){
over_est = quantile(train$logerror, ub)
over_estimate = X[(X$logerror > over_est),
c("longitude", "latitude", "logerror")]
over_estimate$type = "over"
under_est = quantile(train$logerror, lb)
under_estimate = X[(X$abs_logerror < under_est),
c("longitude", "latitude", "logerror")]
under_estimate$type = "under"
pal <- colorFactor(c("red", "navy"), domain = c("over", "under"))
rbind(over_estimate, under_estimate) %>%
leaflet() %>%
addTiles() %>%
addCircleMarkers(lng=~longitude/10e5,
lat=~latitude/10e5,
color = ~pal(type),
radius = 5,
stroke = FALSE, fillOpacity = 0.5)
}
region = train[!is.na(train$region_city)&(train$region_city == "5534"),]
mapRegion(region, 0.9, 0.5)